Install and load awst

if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")

BiocManager::install("drisso/awst")
library(readr)
library(awst)
library(EDASeq)
library(Rtsne)
library(umap)
library(dendextend)

Data import and cleaning

The data are available on GEO with the following accession number.

In the following chunks of code, we arrange a data-frame where each of the cell are annotated, and the annotations are coded as colors.

We mostly follow the original article for data preprocessing.

ADT and annotation

if(!file.exists("annotation.RData")) {
  positive.col <-  "red"
  negative.col <- "gray"
  ttable <- read_csv("https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE100866&format=file&file=GSE100866_PBMC_vs_flow_10X-ADT_umi.csv.gz")
  ttable <- as.data.frame(ttable)
  rownames(ttable) <- ttable[,1]
  ttable <- ttable[,-1]
  ttable <- 1 + t(ttable)
  tmp <- exp(log(apply(ttable, 1, prod))/ncol(ttable))
  ttable <- log(ttable/tmp)
  ttable <- 6 * (pnorm(ttable) - 0.5)
  annotation.df <- as.data.frame(ttable)
  annotation.df$cell <- rownames(annotation.df)
  nBins <- 9
  bbreaks <- seq(-3, 3, length.out = nBins + 1)
  levels_cols <- c("green4", "green3", "green2", "green", "gray75", "red", "red2", "red3", "red4")
  j <- 0
  while(j < ncol(ttable)) {
    j <- j + 1
    annotation.df$tmp <- cut(ttable[, j], breaks = bbreaks, include.lowest = TRUE)
    levels(annotation.df$tmp) <- levels_cols
    colnames(annotation.df)[ncol(annotation.df)] <- paste0(colnames(ttable)[j], ".col")
  }
  save(annotation.df, file = "annotation.RData")
} else {
  load("annotation.RData")
}

Single-cell RNA-seq

Before applying normalization and AWST, we filtered out cells that did not met the following criteria:

  • remove the genes detected in no cells.

  • filter out cells not having at least 500 observed features.

  • filter out cells not having at least 20 different intensities across the features.

if(!file.exists("Level3.RData")) {
ddata <- read_csv("https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE100866&format=file&file=GSE100866_PBMC_vs_flow_10X-RNA_umi.csv.gz")

#restrict the features to the HUMAN ones
ddata <- ddata[grep("^HUMAN_", ddata$X1),]
ddata <- as.data.frame(ddata)
rownames(ddata) <- gsub("HUMAN_", "", ddata$X1)
ddata <- as.matrix(ddata[,-1])
save(ddata, file = "Level3.RData")
} else {
  load("Level3.RData")
}

Apply AWST

if(!file.exists("expression.RData")) {
  ddata <- t(ddata)
  ###
  no_of_detected_gene_per_sample <- rowSums(ddata > 0) 
  fivenum(no_of_detected_gene_per_sample)#  10      638      739      873         4833 
  # restrict the collection of cells to those cells having at least 500 observed features
  sum(no_of_detected_gene_per_sample > 500) # 7613
  dim(ddata <- ddata[no_of_detected_gene_per_sample > 500,])#[1]  7613 17014
  ###
  no_of_different_intensities <- apply(ddata, 1, function(x) length(table(x)))
  fivenum(no_of_different_intensities)#  9     17   20     24    105 
  # restrict the collection of cells to those cells having at least 20 different intensities across the featutures
  dim(ddata <- ddata[which(no_of_different_intensities >= 20), ])#[1] 4123 17014
  ###
  # apply the full quantile normalization 
  normCounts <- EDASeq::betweenLaneNormalization(t(ddata), which = "full", round = FALSE)
  save(normCounts, file = "normCounts.RData")
  ###
  # apply the AWS-transformation 
  library(awst)
  dim(exprData <- awst(normCounts, poscount = TRUE, full_quantile = TRUE)) #[1]  4123 17014
  dim(exprData <- gene_filter(exprData, nBins = 30, heterogeneity_threshold = 0.05)) #[1] 4123  330
  save(exprData, file = "expression.RData")
} else {
  load("expression.RData")
}

Clustering and dimensionality reduction

if(!file.exists("expression_umap_2d.RData")) {
nrow_exprData <- nrow(exprData)
ncol_exprData <- ncol(exprData)
ddist <- dist(exprData)
save(ddist, nrow_exprData, ncol_exprData, file = "expression_dist.RData")

hhc <- hclust(ddist, method = "ward.D2")
save(hhc, nrow_exprData, ncol_exprData, file = "expression_dist_hclust.RData")

pprcomp <- prcomp(exprData)
pprcomp$x <- pprcomp$x[, 1:10]
pprcomp$rotation <- pprcomp$rotation[, 1:10]
save(pprcomp, file = "expression_prcomp.RData")

set.seed(2019) # needed to get the figure in the paper
ans_Rtsne <- Rtsne(exprData, pca = FALSE, perplexity = 250) # Run TSNE
save(ans_Rtsne, file = "expression_Rtsne_2d.RData")

set.seed(2019) # needed to get the figure in the paper
ans_umap <- umap(exprData)
save(ans_umap, file = "expression_umap_2d.RData")
}
load("annotation.RData")
load("expression_dist_hclust.RData")
annotation.df <- annotation.df[hhc$labels,]
###############
save_plots <- FALSE
png_width_large <- 1500
png_height_large <- 750
png_width_small <- width_png <- 700
png_height_small <- 700
png_res <- 1/300
###################
color.bar2 <- function(x_pos, y_pos, lut, min, max=-min, nticks=11, ticks=seq(min, max, len=nticks), title='', values = NULL) {
    scale = (max-min)/length(lut)*0.3
    for (i in 1:length(lut)) {
      y_low  <- (i-1)*scale + min + y_pos
        y_high <-  y_low + scale
        rect(x_pos,y_low,x_pos+.05,y_high, col=lut[i], border=NA)
        text(x_pos+.05, (y_low + y_high)/2, values[i], adj = -0.1)
    }   
}
vvalues <- c("-3.0", "-2.0", "-1.3", "-0.6", " 0.0", " 0.6", " 1.3", " 2.0", " 3.0")
ffill2 <- names(table(annotation.df$CD3.col))

Main Clustering

clustering.prefix <- "CBMC"; short.prefix <- "CBMC"
clustering.df <- data.frame(cell = annotation.df$cell)
rownames(clustering.df) <- clustering.df$cell

############
mmain  <-  paste0("CBMC study (", nrow_exprData, " cells/", ncol_exprData, " genes)")
if(save_plots) {
  mmain <- ""; png("main_dendrogram.png", width= png_width_large, height= png_height_large, res = png_res)
}
hhc$height <- hhc$height/max(hhc$height)

plot(hhc, hang = -1, labels = FALSE, xlab = "", sub = "", main = mmain)
###
wwhere <- 10
hh <- mean(c(hhc$height[length(hhc$height)-wwhere+2], hhc$height[length(hhc$height)-wwhere+1]))

tmp <- tmp_ <- as.factor(cutree(hhc, k = wwhere))

levels(tmp) <- c( "1", "1", "3", "4", "5", "6", "6", "8","4", "5")
wwhere <- length(unique(levels(tmp)))
clusteringWhere <- paste0(clustering.prefix, wwhere)
clusteringWhere.col <- paste0(clusteringWhere, ".col")
assign(clusteringWhere.col, tmp)
if(wwhere < 10) levels(tmp) <- paste0(short.prefix, wwhere, 1:wwhere) else levels(tmp) <- paste0(short.prefix, wwhere, c(paste(1:9), letters[1:(wwhere-9)]))
assign(clusteringWhere, tmp)
levels(tmp) <- c("black", "red", "green3", "blue", "cyan", "magenta")

assign(clusteringWhere.col, tmp)
tt <- table(get(clusteringWhere), get(clusteringWhere.col))
colorCode <- colnames(tt)[apply(tt, 1, which.max)]
names(colorCode) <- rownames(tt)
assign(paste0(clusteringWhere, ".colorCode"), colorCode)
clust.colorCode <- colorCode
clustering.df$tmp <- get(clusteringWhere)
clustering.df$tmp.explanatory <- clustering.df$tmp
clustering.df$tmp.col <- get(clusteringWhere.col)
ncol_ <- ncol(clustering.df)
colnames(clustering.df)[(ncol_-2):ncol_] <- c(clusteringWhere, paste0(clusteringWhere, ".explanatory"), clusteringWhere.col)
levels(clustering.df[, paste0(clusteringWhere, ".explanatory")]) <- c("CBMC1 - T Cell", "CBMC2 - B Cell", "CBMC3 - unclear", "CBMC4 - Monocyte", "CBMC5 - myeloid DC", "CBMC6 - plasmacytoid DC")

annotation.col <- annotation.df[, grep(".col", colnames(annotation.df))]
colnames(annotation.col) <- gsub(".col", "", colnames(annotation.col))
annotation.col <- annotation.col[, rev(c( "CD19", "CD3",  "CD11c",  "CD14","CD4","CD8", "CD2","CD57"))]
annotation.col$CBMC <- clustering.df[, ncol(clustering.df)]

colored_bars(colors = annotation.col, dend = as.dendrogram(hhc), y_scale = 0.17, y_shift = 0.015)
save(clustering.df, clust.colorCode, file = "clustering.RData")

tt <- table(clustering.df[, ncol(clustering.df)-1])
pct <- paste0(round(100*tt/sum(tt), 1), "%")
llegend <- paste(names(tt), " (", tt, "; ", pct, ")", sep = "")
tt <- table(clustering.df[, ncol(clustering.df)-1], clustering.df[, ncol(clustering.df)])
ffill <- colnames(tt)[apply(tt, 1, which.max)]
legend(700, .99, legend=llegend, fill = ffill, y.intersp = 1, box.col = "white", border = "white", title = "CBMC", title.adj = 0)

color.bar3 <- function(x_pos, y_pos, lut, min, max=-min, nticks=11, ticks=seq(min, max, len=nticks), title='', values = NULL) {
    scale = (max-min)/length(lut)*0.3
    for (i in 1:length(lut)) {
      y_low  <- (i-1)*scale + y_pos
        y_high <-  y_low + scale
        rect(x_pos,y_low,x_pos+90,y_high, col=lut[i], border=NA)
        text(x_pos+.05, (y_low + y_high)/2, values[i], adj = -2)
    }   
}
color.bar3(4000, 0.65, ffill2, -0.6, values = vvalues)
text(4045, 0.63, "Marker's level")
text(1, 1, "(a)")

PCA (AWST) | ADT

x_legend <- -0.2; y_legend <- 2.63
x_text <- -2.3; y_text <- 2.
load("expression_prcomp.RData")
pprcomp$x <- scale(pprcomp$x)
mmain = paste0("Principal components analysis (", nrow_exprData, " cells/", ncol_exprData, " features)")
if(save_plots) png("prcomp_CITE6.png", width= png_width_small, height= png_height_small, res = png_res)
plot(pprcomp$x, col = clustering.df$CBMC6.col, main = mmain, 
     xlab = "first principal component", ylab = "secondo principal component", pch = 19)
legend(x_legend, y_legend, legend=llegend, fill = ffill, y.intersp = 1, box.col = "white", border = "white", title = "CBMC", title.adj = 0)
text(x_text, y_text, "(b)")

###############################

mmain <-  "principal component analysis - CD3"#  "prcomp (AWST)| flow cytometry/CD3"
cat(sprintf("\n\n### %s\n\n", mmain))

principal component analysis - CD3

if(save_plots) png(file = "prcomp_CD3.png", width = width_png, height = width_png)
plot(pprcomp$x, col = as.character(annotation.df$CD3.col), main = mmain, 
     xlab = "first principal component", ylab = "secondo principal component", pch = 19)
ffill2 <- names(table(annotation.df$CD3.col))
color.bar2(-2.3, -0.7, ffill2, -3, values = vvalues)
text(x_text, y_text, "(a)")

mmain <-  "principal component analysis - CD19"#  "prcomp (AWST)| flow cytometry/CD19"
cat(sprintf("\n\n### %s\n\n", mmain))

principal component analysis - CD19

if(save_plots) png(file = "prcomp_CD19.png", width = width_png, height = width_png)
plot(pprcomp$x, col = as.character(annotation.df$CD19.col), main = mmain, 
     xlab = "first principal component", ylab = "secondo principal component", pch = 19)
color.bar2(-2.3, -0.7, ffill2, -3, values = vvalues)
text(x_text, y_text, "(b)")

mmain <-  "principal component analysis - CD11c"#  "prcomp (AWST)| flow cytometry/CD11c"
cat(sprintf("\n\n### %s\n\n", mmain))

principal component analysis - CD11c

if(save_plots) png(file = "prcomp_CD11c.png", width = width_png, height = width_png)
plot(pprcomp$x, col = as.character(annotation.df$CD11c.col), main = mmain, 
     xlab = "first principal component", ylab = "secondo principal component", pch = 19)
color.bar2(-2.3, -0.7, ffill2, -3, values = vvalues)
text(x_text, y_text, "(c)")

mmain <-  "principal component analysis - CD14"#  "prcomp (AWST)| flow cytometry/CD14"
cat(sprintf("\n\n### %s\n\n", mmain))

principal component analysis - CD14

if(save_plots) png(file = "prcomp_CD14.png", width = width_png, height = width_png)
plot(pprcomp$x, col = as.character(annotation.df$CD14.col), main = mmain, 
     xlab = "first principal component", ylab = "secondo principal component", pch = 19)
color.bar2(-2.3, -0.7, ffill2, -3, values = vvalues)
text(x_text, y_text, "(d)")

mmain <-  "principal component analysis - CD4"#  "prcomp (AWST)| flow cytometry/CD4"
cat(sprintf("\n\n### %s\n\n", mmain))

principal component analysis - CD4

if(save_plots) png(file = "prcomp_CD4.png", width = width_png, height = width_png)
plot(pprcomp$x, col = as.character(annotation.df$CD4.col), main = mmain, 
     xlab = "first principal component", ylab = "secondo principal component", pch = 19)
color.bar2(-2.3, -0.7, ffill2, -3, values = vvalues)
text(x_text, y_text, "(e)")

mmain <-  "principal component analysis - CD8"#  "prcomp (AWST)| flow cytometry/CD8"
cat(sprintf("\n\n### %s\n\n", mmain))

principal component analysis - CD8

if(save_plots) png(file = "prcomp_CD8.png", width = width_png, height = width_png)
plot(pprcomp$x, col = as.character(annotation.df$CD8.col), main = mmain, 
     xlab = "first principal component", ylab = "secondo principal component", pch = 19)
color.bar2(-2.3, -0.7, ffill2, -3, values = vvalues)
text(x_text, y_text, "(f)")

Rtsne p100 (AWST) | ADT

load("expression_Rtsne_2d.RData")
ans_Rtsne$Y <- scale(ans_Rtsne$Y)
if(save_plots) png("Rtsne.png", width= png_width_small, height= png_height_small, res = png_res)
mmain = "T-distributed Stochastic Neighbor Embedding (tsne)"
plot(-ans_Rtsne$Y[, 1], ans_Rtsne$Y[, 2], col = clustering.df$CBMC6.col, main = mmain, xlab = "", ylab = "")
legend(-1.65, -1.5, legend=llegend, fill = ffill, y.intersp = 1, box.col = "white", border = "white", title = "CBMC", title.adj = 0)
text(-1.65, 2, "(c)")

umap (AWST) | ADT

load("expression_umap_2d.RData")
ans_umap$layout <- scale(ans_umap$layout)
if(save_plots) png("umap.png", width= png_width_small, height= png_height_small, res = png_res)
mmain = "Uniform Manifold Approximation and Projection (umap)"
plot(-ans_umap$layout[, 2], -ans_umap$layout[, 1], col = clustering.df$CBMC6.col, main = mmain, xlab = "", ylab = "")
legend(-1.65, -1.5, legend=llegend, fill = ffill, y.intersp = 1, box.col = "white", border = "white", title = "CBMC", title.adj = 0)
text(-1.65, 1.5, "(d)")

Session info

sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Sierra 10.12.6
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] dendextend_1.12.0           umap_0.2.3.1               
##  [3] Rtsne_0.15                  EDASeq_2.18.0              
##  [5] ShortRead_1.42.0            GenomicAlignments_1.20.1   
##  [7] SummarizedExperiment_1.14.1 DelayedArray_0.10.0        
##  [9] matrixStats_0.55.0          Rsamtools_2.0.2            
## [11] GenomicRanges_1.36.1        GenomeInfoDb_1.20.0        
## [13] Biostrings_2.52.0           XVector_0.24.0             
## [15] IRanges_2.18.3              S4Vectors_0.22.1           
## [17] BiocParallel_1.18.1         Biobase_2.44.0             
## [19] BiocGenerics_0.30.0         awst_0.0.3                 
## [21] readr_1.3.1                
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-6           bit64_0.9-7            RColorBrewer_1.1-2    
##  [4] progress_1.2.2         httr_1.4.1             tools_3.6.1           
##  [7] backports_1.1.5        R6_2.4.0               DBI_1.0.0             
## [10] lazyeval_0.2.2         colorspace_1.4-1       gridExtra_2.3         
## [13] tidyselect_0.2.5       prettyunits_1.0.2      bit_1.1-14            
## [16] compiler_3.6.1         rtracklayer_1.44.4     scales_1.0.0          
## [19] genefilter_1.66.0      askpass_1.1            DESeq_1.36.0          
## [22] stringr_1.4.0          digest_0.6.21          rmarkdown_1.16        
## [25] R.utils_2.9.0          pkgconfig_2.0.3        htmltools_0.4.0       
## [28] rlang_0.4.0            RSQLite_2.1.2          hwriter_1.3.2         
## [31] jsonlite_1.6           dplyr_0.8.3            R.oo_1.22.0           
## [34] RCurl_1.95-4.12        magrittr_1.5           GenomeInfoDbData_1.2.1
## [37] Matrix_1.2-17          Rcpp_1.0.2             munsell_0.5.0         
## [40] reticulate_1.13        viridis_0.5.1          R.methodsS3_1.7.1     
## [43] stringi_1.4.3          yaml_2.2.0             zlibbioc_1.30.0       
## [46] grid_3.6.1             blob_1.2.0             crayon_1.3.4          
## [49] lattice_0.20-38        splines_3.6.1          GenomicFeatures_1.36.4
## [52] annotate_1.62.0        hms_0.5.1              zeallot_0.1.0         
## [55] knitr_1.25             pillar_1.4.2           geneplotter_1.62.0    
## [58] biomaRt_2.40.5         XML_3.98-1.20          glue_1.3.1            
## [61] evaluate_0.14          latticeExtra_0.6-28    vctrs_0.2.0           
## [64] gtable_0.3.0           openssl_1.4.1          purrr_0.3.2           
## [67] assertthat_0.2.1       ggplot2_3.2.1          xfun_0.10             
## [70] aroma.light_3.14.0     xtable_1.8-4           viridisLite_0.3.0     
## [73] survival_2.44-1.1      tibble_2.1.3           AnnotationDbi_1.46.1  
## [76] memoise_1.1.0